Introduction

Load libraries

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidytext)
library(tidyr)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
library(ggplot2)
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(stringr)
library(grid)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(car)
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
library(RSentiment)
#trace("calculate_score",edit=TRUE)

Initialize the dataset

source(file="Data/Analyze_Training_Sets.R")
source(file="Data/merge_data.R")

Exploratory data analysis and Enriching the dataset

Word count

Suicidal posts are more wordy than non-suicidal ones, so this looks like a good predictor.

wordcount <- function(str) {
  sapply(gregexpr("\\b\\W+\\b", str, perl=TRUE), function(x) sum(x>0) ) + 1 
}

complete<-complete %>%
  mutate(wc = wordcount(text))

save(complete,file = "complete.RData")

Compare sentiment between suicidal and non-suicidal texts

I tried more complex sentiments, using the nrc library. The main differences seemed to be in positive and negative sentiments, though.

load(file = "complete.RData")

#Get the tokens out of the posts
post.tok <- complete %>% 
  mutate(linenumber=row_number()) %>%
  unnest_tokens(word,text) 

nrc <- sentiments %>%
  filter(lexicon == "nrc") %>%
  dplyr::select(word, sentiment)

#Label the words with the sentiments using an inner join
#with the nrc sentiments
post.sent = post.tok %>% inner_join(nrc)
## Joining, by = "word"
#Make a table of the sentiments
# Table - only 10 sentiments in posts from
#suicidal users
sui.sent <- post.sent %>%
  filter(suicidal=='suicidal') %>%
  group_by(sentiment) %>%
  summarize(n=n()) %>%
  filter(n>10) %>%  
  arrange(desc(n))

# Table - 10 most common sentiments in posts NOT in r/depression 
# from from suicidal users
not.sui.sent <- post.sent %>% 
  filter(suicidal=='not suicidal') %>%
  group_by(sentiment) %>%
  summarize(n=n()) %>%
  filter(n>10) %>%  
  arrange(desc(n))

#not.sui.sent[1:10,]

comparison <- sui.sent %>%
  rename(sui = n) %>%
  inner_join(not.sui.sent,by="sentiment") %>%
  rename(not.sui = n) %>%
  mutate(sui = sui / sum(sui),
         not.sui = not.sui / sum(not.sui),diff=sui-not.sui) %>%
  arrange(diff)

sui.sent.for.merge <- comparison[1:10,1:2] %>%
  mutate(suicidal = "suicidal")
not.sui.sent.for.merge <- comparison[1:10,c(1,3)] %>%
  mutate(suicidal = "not suicidal")

colnames(sui.sent.for.merge)<-c("sentiment","value","suicidal")
colnames(not.sui.sent.for.merge)<-c("sentiment","value","suicidal")

plot.sentiments <-rbind(sui.sent.for.merge,not.sui.sent.for.merge)

plot.sentiments$sentiment <- factor(plot.sentiments$sentiment, 
                                    levels = plot.sentiments$sentiment[order(comparison[1:10,]$diff)])
plot.sentiments$sentiment  # notice the changed order of factor levels
##  [1] positive     anticipation joy          trust        surprise    
##  [6] anger        disgust      fear         sadness      negative    
## [11] positive     anticipation joy          trust        surprise    
## [16] anger        disgust      fear         sadness      negative    
## 10 Levels: positive anticipation joy trust surprise anger disgust ... negative
ggplot(plot.sentiments, aes(sentiment, value)) +
  geom_bar(aes(fill = suicidal), position = "dodge", stat="identity")

Simple sentiment scores

Simple sentiments (positive, negative, neutral)

load(file = "complete.RData")

#Make a table of the sentiments
# Table - only 10 sentiments in posts from
#suicidal users
complete <- complete %>%
  mutate(score = calculate_score(text), post.id = row_number())

save(complete,file="complete.RData")
load(file="complete.RData")

#Plot the sentiment range
#hist(complete2$score)

ggplot(complete, aes(post.id, score, color=factor(suicidal)))+
  geom_point() + 
  scale_color_brewer(palette="Set1")+
  labs(title = "Sentiment score in mental health subreddits")+
  theme(legend.position = c(0.2, 0.15))

Readability

Suicidal posts are less readable than non-suicidal ones (higher reading age/grade-level), so this looks like a good predictor.

#### Linguistic Features ###

#Readability
#install.packages('koRpus')
#install.packages('tm')
library(koRpus)
## 
## Attaching package: 'koRpus'
## The following object is masked from 'package:dplyr':
## 
##     query
#library(tm)

#get my list of source files.
load(file="complete.RData")

#Write each post to its own file
n <-length(complete$text)

filepaths <-rep("",n)

for (i in 1:n){
  str <-complete$text[i]
  text<- paste(str,i,sep="")
  filename <-paste("str/str",i,".txt",sep="")
  filepaths[i]<-filename
  write(str,file=filename)
}

#list of kRp.tagged object using tokenize which is the default tagger
#given with the koRpus package
ll.tagged <- lapply(filepaths, tokenize, lang="en")

#Once I have my list of "tagged" objects I can get flesch-kincaid readability, in age
ll.flesch <- lapply(ll.tagged,flesch.kincaid,quiet=TRUE)
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!

## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
age <-rep(0,n)

#

#Now write all those to a .txt file
for (i in 1:n){
  age[i] <-attr(ll.flesch[[i]], which="Flesch.Kincaid")$age
}

complete <- complete %>%
  mutate(age = age)

#Write complete data to file
save(complete,file = "complete.RData")

load(file="complete.RData")
#complete2 <- complete2 %>% 

ggplot(complete, aes(post.id, age, color=factor(suicidal)))+
  geom_point() + 
  #  scale_color_brewer(palette="Set1")  +
  labs(title = "Reading age of posts and
       comments in mental health subreddits")

#Looks like the readability of the suicidal posts is 15 or above?
#almost def above 30
#most are under 15 for both categories

Word choice

For this section, note that value is percentage of the text that contains that word (e.g., the text “cat dog cat” is 66.7% “cat”).

Part 1: Stop words
Part 1.A : Non-suicidal words
load("complete.RData")

#Get the tokens out of the posts
post.tok <- complete %>% 
  mutate(linenumber=row_number()) %>%
  unnest_tokens(word,text) 

###Part 1: Stop words

###Part 1.A : Non-suicidal words
data("stop_words")
tidy.post.tok <- post.tok

#Get counts for each
suicidal.post.tok <- tidy.post.tok %>%
  filter(suicidal=='suicidal') %>%
  group_by(word) %>%
  summarize(n=n()) %>%
  arrange(desc(n))

not.suicidal.post.tok <- tidy.post.tok %>%
  filter(suicidal=='not suicidal') %>%
  group_by(word) %>%
  summarize(n=n()) %>%
  arrange(desc(n))

comparison <- suicidal.post.tok %>%
  rename(sui = n) %>%
  inner_join(not.suicidal.post.tok,by="word") %>%
  rename(not.sui = n) %>%
  mutate(sui = sui / sum(sui),
         not.sui = not.sui / sum(not.sui),diff=sui-not.sui) %>%
  arrange(diff)

#The words used by the depression NON-suicidal users is interesting.
#We could use these as predictors for the logistic regression.
head(comparison)
## # A tibble: 6 × 4
##    word         sui     not.sui         diff
##   <chr>       <dbl>       <dbl>        <dbl>
## 1   you 0.004815952 0.015738152 -0.010922199
## 2  your 0.001129584 0.004427475 -0.003297891
## 3     a 0.020663952 0.023517602 -0.002853650
## 4    is 0.008218233 0.010504051 -0.002285818
## 5   are 0.002360628 0.004373700 -0.002013072
## 6   can 0.002915274 0.004588800 -0.001673526
#The words most used by the suicidal users isn't useful,
#since we identify them by the words "die", "kill", etc.,
#so it's unsurprising the words in the phrases we used to 
#appear the most frequently and don't appear much in the non-suicidal group.
tail(comparison)
## # A tibble: 6 × 4
##    word         sui     not.sui        diff
##   <chr>       <dbl>       <dbl>       <dbl>
## 1    to 0.037993263 0.035473579 0.002519685
## 2   and 0.032095074 0.029325303 0.002769772
## 3  want 0.006567822 0.002921775 0.003646047
## 4    me 0.015347466 0.010611601 0.004735864
## 5    my 0.022564630 0.014106976 0.008457653
## 6     i 0.068160604 0.050261705 0.017898899
range(comparison$diff) 
## [1] -0.0109222  0.0178989
#Plot the difference in word use as graphs
plot.words <- melt(head(comparison)) %>% filter(variable!="diff")
## Using word as id variables
plot.words$word <- factor(plot.words$word, 
                          levels = plot.words$word[order(comparison[1:10,]$diff)])
## Warning in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels)
## else paste0(labels, : duplicated levels in factors are deprecated
plot.words$word  # notice the changed order of factor levels
##  [1] you  your a    is   are  can  you  your a    is   are  can 
## Levels: you your a is are can you your a is
ggplot(plot.words, aes(word, value))+
  geom_bar(aes(fill = variable), position = "dodge", stat="identity")+
  labs(title = "Word choice in posts and comments in mental health subreddits")+
  theme(legend.position = c(0.86, 0.85))
## Warning in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels)
## else paste0(labels, : duplicated levels in factors are deprecated

## Warning in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels)
## else paste0(labels, : duplicated levels in factors are deprecated

#Now write all those to a column
n<-length(complete$text)

#these words are wrong, fix them
a_word<-rep(0,n) 
is_word<-rep(0,n)
are_word<-rep(0,n)
can_word<-rep(0,n)

for (i in 1:n) {
  a_word[i] <-str_count(complete$text[i],"a") + str_count(complete$text[i],"a") 
  is_word[i] <-str_count(complete$text[i]," is ") + str_count(complete$text[i],"Is ")
  are_word[i] <-str_count(complete$text[i]," are ") + str_count(complete$text[i]," Are ") 
  can_word[i] <-str_count(complete$text[i]," can ") + str_count(complete$text[i],"Can ") 
}

complete<- cbind(complete,a_word,is_word,are_word,
                 can_word)
save(complete,file = "complete.RData")
load(file="complete.RData")

Part1.B Suicidal users’ stop words

#Plot the difference in word use as graphs

sui.words <-tail(comparison)
plot.words2 <- melt(sui.words) %>% filter(variable!="diff")
## Using word as id variables
plot.words2$word <- factor(plot.words2$word, 
                           levels = plot.words2$word[order(sui.words$diff)])

plot.words2$word  # notice the changed order of factor levels
##  [1] to   and  want me   my   i    to   and  want me   my   i   
## Levels: to and want me my i
ggplot(plot.words2, aes(word, value))+
  geom_bar(aes(fill = variable), position = "dodge", stat="identity")+
  labs(title = "Word choice in posts and comments in mental health subreddits")+
  theme(legend.position = c(0.16, 0.85))

range(sui.words$diff)
## [1] 0.002519685 0.017898899
#Now write all those to a column
n<-length(complete$text)

to_word<-rep(0,n)
and_word<-rep(0,n)
want_word<-rep(0,n)

for (i in 1:n) {
  to_word[i] <-str_count(complete$text[i]," to ") + str_count(complete$text[i]," To ") 
  and_word[i] <-str_count(complete$text[i]," and ") + str_count(complete$text[i],"And ") 
  want_word[i] <-str_count(complete$text[i],"want") 
}

complete<- cbind(complete,to_word,and_word,want_word)
Part 2: Non-stop words
Part 2.A : Non-suicidal words
#get rid of stop words
data("stop_words")
tidy.post.tok <- post.tok %>%
  anti_join(stop_words)
## Joining, by = "word"
#Get counts for each
suicidal.post.tok <- tidy.post.tok %>%
  filter(suicidal=='suicidal') %>%
  group_by(word) %>%
  summarize(n=n()) %>%
  arrange(desc(n))

not.suicidal.post.tok <- tidy.post.tok %>%
  filter(suicidal=='not suicidal') %>%
  group_by(word) %>%
  summarize(n=n()) %>%
  arrange(desc(n))

comparison <- suicidal.post.tok %>%
  rename(sui = n) %>%
  inner_join(not.suicidal.post.tok,by="word") %>%
  rename(not.sui = n) %>%
  mutate(sui = sui / sum(sui),
         not.sui = not.sui / sum(not.sui),diff=sui-not.sui) %>%
  arrange(diff)

#The words used by the depression NON-suicidal users is interesting.
#We could use these as predictors for the logistic regression.
head(comparison)
## # A tibble: 6 × 4
##       word          sui     not.sui         diff
##      <chr>        <dbl>       <dbl>        <dbl>
## 1   people 0.0126053905 0.016809653 -0.004204263
## 2   person 0.0041741534 0.006207714 -0.002033561
## 3   advice 0.0008845888 0.002510986 -0.001626397
## 4     hope 0.0018797512 0.003347981 -0.001468230
## 5 positive 0.0005805114 0.002022738 -0.001442227
## 6  anxiety 0.0031789910 0.004533724 -0.001354733
#The words most used by the suicidal users isn't useful,
#since we identify them by the words "die", "kill", etc.,
#so it's unsurprising the words in the phrases we used to 
#appear the most frequently and don't appear much in the non-suicidal group.
tail(comparison)
## # A tibble: 6 × 4
##      word         sui     not.sui        diff
##     <chr>       <dbl>       <dbl>       <dbl>
## 1    dont 0.004284727 0.000697496 0.003587231
## 2 fucking 0.005860401 0.001673990 0.004186410
## 3 anymore 0.006330339 0.001394992 0.004935347
## 4    life 0.018852799 0.012833926 0.006018873
## 5     die 0.007767795 0.001325242 0.006442553
## 6    kill 0.007242571 0.000697496 0.006545075
range(comparison$diff) 
## [1] -0.004204263  0.006545075
#Plot the difference in word use as graphs
plot.words <- melt(head(comparison)) %>% filter(variable!="diff")
## Using word as id variables
plot.words$word <- factor(plot.words$word, 
                          levels = plot.words$word[order(comparison[1:10,]$diff)])
## Warning in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels)
## else paste0(labels, : duplicated levels in factors are deprecated
plot.words$word  # notice the changed order of factor levels
##  [1] people   person   advice   hope     positive anxiety  people  
##  [8] person   advice   hope     positive anxiety 
## 10 Levels: people person advice hope positive anxiety people ... hope
ggplot(plot.words, aes(word, value))+
  geom_bar(aes(fill = variable), position = "dodge", stat="identity")+
  labs(title = "Word choice in posts and comments in mental health subreddits")+
  theme(legend.position = c(0.86, 0.85))
## Warning in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels)
## else paste0(labels, : duplicated levels in factors are deprecated

## Warning in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels)
## else paste0(labels, : duplicated levels in factors are deprecated

#Now write all those to a column
n<-length(complete$text)

#these words are wrong, fix them
people_word<-rep(0,n)
person_word<-rep(0,n) 
hope_word<-rep(0,n) 
advice_word<-rep(0,n)
positive_word<-rep(0,n)
anxiety_word<-rep(0,n)

for (i in 1:n) {
  people_word[i] <-str_count(complete$text[i],"people") 
  person_word[i] <-str_count(complete$text[i],"person") 
  hope_word[i] <-str_count(complete$text[i],"hope") 
  advice_word[i] <-str_count(complete$text[i],"advice") 
  positive_word[i] <-str_count(complete$text[i],"positive") 
  anxiety_word[i] <-str_count(complete$text[i],"anxiety") 
}

complete<- cbind(complete, people_word,person_word,hope_word,advice_word,
                 positive_word,anxiety_word)
save(complete,file = "complete.RData")
Part 2.B Suicidal users’ words
#Plot the difference in word use as graphs

sui.words <-tail(comparison)
plot.words2 <- melt(sui.words) %>% filter(variable!="diff")
## Using word as id variables
#sui.words<-plot.words2
#sui.words

#desc(sui.words$diff)
plot.words2$word <- factor(plot.words2$word, 
                           levels = plot.words2$word[order(sui.words$diff)])

plot.words2$word  # notice the changed order of factor levels
##  [1] dont    fucking anymore life    die     kill    dont    fucking
##  [9] anymore life    die     kill   
## Levels: dont fucking anymore life die kill
ggplot(plot.words2, aes(word, value))+
  geom_bar(aes(fill = variable), position = "dodge", stat="identity")+
  labs(title = "Word choice in posts and comments in mental health subreddits")+
  theme(legend.position = c(0.16, 0.85))

range(sui.words$diff)
## [1] 0.003587231 0.006545075
#Now write all those to a column
n<-length(complete$text)

kill_word<-rep(0,n)
die_word<-rep(0,n)
anymore_word<-rep(0,n)
life_word<-rep(0,n)
fucking_word <-rep(0,n)
dont_word <-rep(0,n)

for (i in 1:n) {
  kill_word[i] <-str_count(complete$text[i],"kill") 
  die_word[i] <-str_count(complete$text[i]," die ") +
    str_count(complete$text[i]," die.") +
    str_count(complete$text[i]," die!") +
    str_count(complete$text[i]," die?") +
    str_count(complete$text[i]," died") + 
    str_count(complete$text[i]," DIE")
  life_word[i] <-str_count(complete$text[i],"life") 
  anymore_word[i] <-str_count(complete$text[i],"anymore") 
  fucking_word[i] <-str_count(complete$text[i],"fucking") 
  dont_word[i] <-str_count(complete$text[i],"don't") 
}

complete<- cbind(complete, kill_word,die_word,anymore_word,
                 life_word,fucking_word,dont_word)

save(complete,file = "complete.RData")

Pronoun Use/Higher Self-Attentional Focus

Suicidal people exhibit more self-attentional focus, so their pronoun use looks to be of interest.

1st person
#2. Higher self-attentional focus
#Pronouns
load(file="complete.RData")
#Now write all those to a column
n<-length(complete$text)
first_pronouns<-rep(0,n)

for (i in 1:n) {
  first_pronouns[i] <-str_count(complete$text[i]," I ") +
    str_count(complete$text[i]," i ") +
    str_count(complete$text[i],"I’m") +
    str_count(complete$text[i],"I‘d") +
    str_count(complete$text[i],"I‘ll") +
    str_count(complete$text[i],"I’ve") +
    str_count(complete$text[i]," me ") +
    str_count(complete$text[i]," me.") +
    str_count(complete$text[i]," me?") +
    str_count(complete$text[i]," me!") +
    str_count(complete$text[i]," my ") +
    str_count(complete$text[i]," My ")
}

complete<- cbind(complete, first_pronouns)

save(complete,file = "complete.RData")

ggplot(complete, aes(post.id, first_pronouns,color=factor(suicidal)))+
  geom_point() + 
  scale_color_brewer(palette="Set1")  +
  labs(title = "First-person pronouns in reddit posts")+
  theme(legend.position = c(0.1, 0.9))

2nd person
#2nd person
n<-length(complete$text)
sec_pronouns<-rep(0,n)

for (i in 1:n) {
  sec_pronouns[i] <-str_count(complete$text[i]," you ") + str_count(complete$text[i],"You") +
    str_count(complete$text[i]," you’re ") + str_count(complete$text[i],"You're") +
    str_count(complete$text[i],"you’d") + str_count(complete$text[i],"You'd") +
    str_count(complete$text[i],"you’ll") + str_count(complete$text[i],"You'll") +
    str_count(complete$text[i],"you’ve") + str_count(complete$text[i],"You've") +
    str_count(complete$text[i],"your") + str_count(complete$text[i],"Your") +
    str_count(complete$text[i],"yours")
}

complete<- cbind(complete, sec_pronouns)

save(complete,file = "complete.RData")

ggplot(complete, aes(post.id, sec_pronouns,color=factor(suicidal)))+
  geom_point() + 
  scale_color_brewer(palette="Set1")  +
  labs(title = "Second-person pronouns in reddit posts")+
  theme(legend.position = c(0.1, 0.9))

3rd person
#3rd person
source(file="pronoun_dict.R")
third.pronouns <-as.data.frame(word.list[3])
colnames(third.pronouns) <-"word"

#suicidal
sui.post.tok <- complete %>% 
  filter(suicidal=='suicidal') %>%
  unnest_tokens(word,text) 

sui.pronoun.post.tok <- sui.post.tok %>%
  group_by(word) %>%
  summarize(n=n()) %>%
  arrange(desc(n))

sui.third = inner_join(sui.post.tok, third.pronouns,by="word")
## Warning in inner_join_impl(x, y, by$x, by$y, suffix$x, suffix$y): joining
## character vector and factor, coercing into character vector
sui.results <- sui.third %>%
  group_by(word) %>%
  summarize(n=n()) %>%
  arrange(desc(n))

#Not suicidal
not.sui.post.tok <- complete %>% 
  filter(suicidal=='not suicidal') %>%
  unnest_tokens(word,text) 

not.sui.pronoun.post.tok <- not.sui.post.tok %>%
  group_by(word) %>%
  summarize(n=n()) %>%
  arrange(desc(n))

not.sui.third = inner_join(not.sui.pronoun.post.tok, third.pronouns,by="word")
## Warning in inner_join_impl(x, y, by$x, by$y, suffix$x, suffix$y): joining
## character vector and factor, coercing into character vector
not.sui.results <- not.sui.third %>%
  group_by(word) %>%
  summarize(n=n()) %>%
  arrange(desc(n))

third.comparison <- sui.results %>%
  rename(sui = n) %>%
  inner_join(not.sui.results,by="word") %>%
  rename(not.sui = n) %>%
  mutate(sui = sui / sum(sui),
         not.sui = not.sui / sum(not.sui),diff=sui-not.sui) %>%
  arrange(diff)

#Plot the difference in word use as graphs
library(reshape2)
#plot.words <- melt(comparison[1:10,]) %>% filter(variable!="diff")
plot.words <- melt(third.comparison) %>% filter(variable!="diff")
## Using word as id variables
plot.words$word <- factor(plot.words$word, 
                          levels = plot.words$word[order(third.comparison$diff)])
plot.words$word  # notice the changed order of factor levels
##  [1] hers  it’s  its   his   their him   them  he    they  her   it   
## [12] hers  it’s  its   his   their him   them  he    they  her   it   
## Levels: hers it’s its his their him them he they her it
ggplot(plot.words, aes(word, value))+
  geom_bar(aes(fill = variable), position = "dodge", stat="identity")+
  labs(title = "Word choice in posts and comments in mental health subreddits")+
  theme(legend.position = c(0.86, 0.85))

#it looks predictive
#her and it words predict suicidal thoughts, all other pronouns are not suicidal
her_word<-rep(0,nrow(complete))
for (i in 1:n) {
  her_word[i] <-
    str_count(complete$text[i]," her ") +     str_count(complete$text[i]," Her ") +
    str_count(complete$text[i]," her.") +     str_count(complete$text[i],"her!") +
    str_count(complete$text[i]," her? ")
}

it_word<-rep(0,nrow(complete))
for (i in 1:n) {
  it_word[i] <-str_count(complete$text[i]," I ") +
    str_count(complete$text[i]," it ") +     str_count(complete$text[i],"It ") +
    str_count(complete$text[i]," it. ") +     str_count(complete$text[i],"it!") +
    str_count(complete$text[i]," it? ")
}

third_pronouns<-rep(0,nrow(complete))
for (i in 1:n) {
  third_pronouns[i]<-
    str_count(complete$text[i]," hers ") + str_count(complete$text[i],"Her ") +
    str_count(complete$text[i],"it's") + str_count(complete$text[i],"It's") +
    str_count(complete$text[i]," its ") + str_count(complete$text[i],"Its") +
    str_count(complete$text[i]," his ") +     str_count(complete$text[i]," His ") + 
    str_count(complete$text[i],"their") +     str_count(complete$text[i],"Their") + 
    str_count(complete$text[i]," he ") +     str_count(complete$text[i],"He ") + 
    str_count(complete$text[i]," him") +
    str_count(complete$text[i],"them") +
    str_count(complete$text[i],"they") +     str_count(complete$text[i],"They")
}

complete<-cbind(complete,third_pronouns,it_word,her_word)
save(complete,file="complete.RData")
load("complete.RData")

ggplot(complete, aes(post.id, it_word,color=factor(suicidal)))+
  geom_point() + 
  scale_color_brewer(palette="Set1")  +
  labs(title = "Use of 'it' in reddit posts")+
  theme(legend.position = c(0.1, 0.9))

ggplot(complete, aes(post.id, third_pronouns,color=factor(suicidal)))+
  geom_point() + 
  scale_color_brewer(palette="Set1")  +
  labs(title = "Third-person pronouns in reddit posts")+
  theme(legend.position = c(0.1, 0.9))

ggplot(complete, aes(post.id, her_word,color=factor(suicidal)))+
  geom_point() + 
  scale_color_brewer(palette="Set1")  +
  labs(title = "Use of word 'her' in reddit posts")+
  theme(legend.position = c(0.1, 0.9))

Triggers

These were chosen based on the subject matter of the hand classified posts.

girl_word<-rep(0,nrow(complete))
for (i in 1:n) {
  girl_word[i]<-str_count(complete$text[i],"girl") + str_count(complete$text[i],"Girl")
}

family_words<-rep(0,nrow(complete))
for (i in 1:n) {
  family_words[i]<- str_count(complete$text[i],"Mom") + str_count(complete$text[i],"mom") + 
    str_count(complete$text[i],"dad") + str_count(complete$text[i],"Dad") + 
    str_count(complete$text[i],"parents") + str_count(complete$text[i],"family") +
    str_count(complete$text[i],"brother") + str_count(complete$text[i],"sister") +
    str_count(complete$text[i],"cousin")
}

job_words<-rep(0,nrow(complete))
for (i in 1:n) {
  job_words[i]<- str_count(complete$text[i],"job") + str_count(complete$text[i],"employ") 
}

friend_words<-rep(0,nrow(complete))
for (i in 1:n) {
  friend_words[i]<- str_count(complete$text[i],"friend") + str_count(complete$text[i],"Friend")
}

lone_words<-rep(0,nrow(complete))
for (i in 1:n) {
  lone_words[i]<- str_count(complete$text[i],"lone") +
    str_count(complete$text[i],"no one") + str_count(complete$text[i],"No one")
}

therapy_words<-rep(0,nrow(complete))
for (i in 1:n) {
  therapy_words[i]<- str_count(complete$text[i],"psychiatr") + str_count(complete$text[i],"Psychiatr") 
    str_count(complete$text[i],"therap") + str_count(complete$text[i],"Therap")
}

help_word<-rep(0,nrow(complete))
for (i in 1:n) {
  help_word[i]<- str_count(complete$text[i],"help")
}

complete<-cbind(complete,girl_word,family_words,job_words,friend_words,lone_words,therapy_words,help_word)
save(complete,file="complete.RData")
load("complete.RData")

ggplot(complete, aes(post.id, girl_word,color=factor(suicidal)))+
  geom_point() + 
  scale_color_brewer(palette="Set1")  +
  labs(title = "Use of 'girl' in reddit posts")+
  theme(legend.position = c(0.1, 0.9))

ggplot(complete, aes(post.id, family_words,color=factor(suicidal)))+
  geom_point() + 
  scale_color_brewer(palette="Set1")  +
  labs(title = "Word choice in reddit posts")+
  theme(legend.position = c(0.1, 0.9))

ggplot(complete, aes(post.id, friend_words,color=factor(suicidal)))+
  geom_point() + 
  scale_color_brewer(palette="Set1")  +
  labs(title = "Word choice in reddit posts")+
  theme(legend.position = c(0.1, 0.9))

ggplot(complete, aes(post.id, lone_words,color=factor(suicidal)))+
  geom_point() + 
  scale_color_brewer(palette="Set1")  +
  labs(title = "Word choice in reddit posts")+
  theme(legend.position = c(0.1, 0.9))

ggplot(complete, aes(post.id, therapy_words,color=factor(suicidal)))+
  geom_point() + 
  scale_color_brewer(palette="Set1")  +
  labs(title = "Word choice in reddit posts")+
  theme(legend.position = c(0.1, 0.9))

ggplot(complete, aes(post.id, help_word,color=factor(suicidal)))+
  geom_point() + 
  scale_color_brewer(palette="Set1")  +
  labs(title = "Word choice in reddit posts")+
  theme(legend.position = c(0.1, 0.9))

Clean up the dataset

#Change all word counts to log
predictors<-colnames(complete)[11:40]

n<-length(predictors)
mut_text=""

for (i in 1:n){
  currvar <-predictors[i]
  if (i==n){
    mut_text = paste(mut_text,'mutate(', currvar, '=log(',currvar, '+1))', sep="")
  }else{
    mut_text = paste(mut_text,'mutate(', currvar, '=log(',currvar, '+1)) %>% ', sep="")
  }
}


complete2<- complete %>%
  mutate(wc=log(wc+1)) %>%
  mutate(score=log(score+100)) %>%
#  mutate(grade=log(grade+5)) %>%
  mutate(age=log(age)) %>% 
  mutate(a_word = log(a_word+1)) %>%
  mutate(is_word =log(is_word+1)) %>% 
  mutate(are_word = log(are_word+1)) %>%
  mutate(can_word=log(can_word+1)) %>%
  mutate(to_word=log(to_word+1)) %>%
  mutate(and_word=log(and_word+1)) %>%
  mutate(want_word = log(want_word+1)) %>%  
  mutate(people_word=log(people_word+1)) %>%
  mutate(person_word = log(person_word+1)) %>%
  mutate(hope_word=log(hope_word+1)) %>%
  mutate(advice_word = log(advice_word+1)) %>%
  mutate(positive_word = log(positive_word+1)) %>%
  mutate(anxiety_word = log(anxiety_word+1)) %>%
  mutate(kill_word=log(kill_word+1)) %>%
  mutate(die_word=log(die_word+1)) %>% #
  mutate(anymore_word = log(anymore_word+1)) %>%
  mutate(life_word = log(life_word+1)) %>%
  mutate(fucking_word = log(fucking_word+1)) %>%
  mutate(dont_word = log(dont_word+1)) %>%
  mutate(first_pronouns=log(first_pronouns+1)) %>%
  mutate(sec_pronouns=log(sec_pronouns+1)) %>%
  mutate(third_pronouns=log(third_pronouns+1)) %>%
  mutate(it_word=log(it_word+1)) %>% #
  mutate(her_word=log(her_word+1)) %>%
  mutate(girl_word=log(girl_word+1)) %>%
  mutate(family_words=log(family_words+1)) %>%
  mutate(job_words=log(job_words+1)) %>%
  mutate(friend_words = log(friend_words+1)) %>% 
  mutate(lone_words = log(lone_words+1)) %>% #
  mutate(therapy_words=log(therapy_words+1)) %>%
  mutate(help_word=log(help_word+1))

save(complete2,file = "complete2.RData")

Graph each log-word count

load("complete2.RData")

log_predictors <-colnames(complete2[,10:40])

#No y-limit
for (i in 1:length(log_predictors)){
  curr_var <-log_predictors[i]
  eval(parse(text=paste('p',i,'<- ggplot(complete2, aes(post.id, ',curr_var,',color=factor(suicidal)))+geom_point() +ylim(0,10)+scale_color_brewer(palette="Set1")',sep="")))
}

#They've all been logged
for (i in 1:length(log_predictors)){
  curr_var <-log_predictors[i]
  eval(parse(text=paste('print(p',i,')',sep="")))
}

Final clean-up
load("complete.RData")

#Edit the dataframe to change 'suicidal' to 1,
load("complete.RData")
complete$suicidal<-gsub("not suicidal",0, complete$suicidal)
complete$suicidal<-gsub("suicidal",1, complete$suicidal)
complete$suicidal<-as.integer(complete$suicidal)
save(complete, file="complete.RData")

load("complete2.RData")
complete2$suicidal<-gsub("not suicidal",0, complete2$suicidal)
complete2$suicidal<-gsub("suicidal",1, complete2$suicidal)
complete2$suicidal<-as.integer(complete2$suicidal)
save(complete2, file="complete2.RData")

######################

Analysis

#Prepare the datasets
load("complete2.RData")

#Select the outcome and predictors
data.for.analysis <-complete2 %>%
  select(suicidal)
data.for.analysis <-cbind(data.for.analysis,complete2[,7:40])

save(data.for.analysis,file = "Data_for_analysis.RData")
load("Data_for_analysis.RData")

##############

#Get your training and test sets
set.seed(1)
train.indices<-sample(1368,1024)
training.set <-data.for.analysis[train.indices,]
test.set <-data.for.analysis[-train.indices,]

#Fit the full model
fit_full<-glm(suicidal~.,data=training.set,family=binomial)
summary(fit_full)
## 
## Call:
## glm(formula = suicidal ~ ., family = binomial, data = training.set)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.8857  -0.5292  -0.2558   0.3963   2.8743  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -4.44843    1.73561  -2.563 0.010376 *  
## wc              0.20403    0.45725   0.446 0.655440    
## score           0.06910    0.31755   0.218 0.827742    
## age             0.47414    0.27203   1.743 0.081341 .  
## a_word         -0.20037    0.33190  -0.604 0.546044    
## is_word        -0.51352    0.22106  -2.323 0.020182 *  
## are_word        0.05483    0.28871   0.190 0.849367    
## can_word       -0.08188    0.27946  -0.293 0.769533    
## to_word         0.02976    0.22498   0.132 0.894767    
## and_word       -0.57828    0.21436  -2.698 0.006983 ** 
## want_word       1.26747    0.25512   4.968 6.76e-07 ***
## people_word    -0.72366    0.27134  -2.667 0.007655 ** 
## person_word    -1.22592    0.40391  -3.035 0.002404 ** 
## hope_word      -0.34760    0.43962  -0.791 0.429138    
## advice_word    -2.07378    0.71325  -2.907 0.003643 ** 
## positive_word  -2.97012    0.89981  -3.301 0.000964 ***
## anxiety_word   -0.66357    0.38410  -1.728 0.084062 .  
## kill_word       3.61617    0.45457   7.955 1.79e-15 ***
## die_word        1.10130    0.18364   5.997 2.01e-09 ***
## anymore_word    1.83588    0.50688   3.622 0.000292 ***
## life_word       0.88977    0.26926   3.304 0.000952 ***
## fucking_word    1.22195    0.49813   2.453 0.014165 *  
## dont_word      -0.16658    0.23545  -0.708 0.479246    
## first_pronouns  0.91727    0.23109   3.969 7.21e-05 ***
## sec_pronouns   -0.32148    0.15503  -2.074 0.038115 *  
## third_pronouns  0.08423    0.16888   0.499 0.617933    
## it_word         0.14888    0.21487   0.693 0.488372    
## her_word       -0.36635    0.16966  -2.159 0.030826 *  
## girl_word       0.04711    0.43459   0.108 0.913679    
## family_words    0.18362    0.25842   0.711 0.477363    
## job_words      -0.84854    0.33689  -2.519 0.011778 *  
## friend_words    0.06332    0.27134   0.233 0.815483    
## lone_words      0.41442    0.36885   1.124 0.261207    
## therapy_words  -1.26600    0.79588  -1.591 0.111681    
## help_word      -0.08585    0.28789  -0.298 0.765538    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1406.85  on 1023  degrees of freedom
## Residual deviance:  717.39  on  989  degrees of freedom
## AIC: 787.39
## 
## Number of Fisher Scoring iterations: 6
#Select a model using backwards stepwise selection
fit_reduced = step(fit_full,trace=0)
summary(fit_reduced)
## 
## Call:
## glm(formula = suicidal ~ age + is_word + and_word + want_word + 
##     people_word + person_word + advice_word + positive_word + 
##     anxiety_word + kill_word + die_word + anymore_word + life_word + 
##     fucking_word + first_pronouns + sec_pronouns + her_word + 
##     job_words + therapy_words, family = binomial, data = training.set)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.7409  -0.5380  -0.2591   0.3944   2.8022  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     -3.9790     0.6477  -6.143 8.09e-10 ***
## age              0.4685     0.2619   1.789 0.073561 .  
## is_word         -0.4810     0.2102  -2.288 0.022126 *  
## and_word        -0.5610     0.1915  -2.929 0.003395 ** 
## want_word        1.2998     0.2393   5.432 5.57e-08 ***
## people_word     -0.6354     0.2530  -2.511 0.012028 *  
## person_word     -1.2111     0.3913  -3.095 0.001967 ** 
## advice_word     -2.0931     0.6777  -3.088 0.002013 ** 
## positive_word   -3.0176     0.8673  -3.479 0.000503 ***
## anxiety_word    -0.6872     0.3688  -1.864 0.062378 .  
## kill_word        3.6636     0.4500   8.142 3.89e-16 ***
## die_word         1.1538     0.1782   6.474 9.56e-11 ***
## anymore_word     1.8754     0.4890   3.835 0.000125 ***
## life_word        0.8749     0.2564   3.412 0.000646 ***
## fucking_word     1.2409     0.4828   2.570 0.010156 *  
## first_pronouns   1.0226     0.1491   6.859 6.96e-12 ***
## sec_pronouns    -0.3345     0.1355  -2.468 0.013594 *  
## her_word        -0.2909     0.1556  -1.869 0.061603 .  
## job_words       -0.7721     0.3236  -2.386 0.017036 *  
## therapy_words   -1.3362     0.7567  -1.766 0.077401 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1406.85  on 1023  degrees of freedom
## Residual deviance:  722.19  on 1004  degrees of freedom
## AIC: 762.19
## 
## Number of Fisher Scoring iterations: 6
formula(fit_reduced)
## suicidal ~ age + is_word + and_word + want_word + people_word + 
##     person_word + advice_word + positive_word + anxiety_word + 
##     kill_word + die_word + anymore_word + life_word + fucking_word + 
##     first_pronouns + sec_pronouns + her_word + job_words + therapy_words
training.subset <-training.set %>%
  select(c(suicidal,age , is_word , and_word , want_word , people_word , 
           person_word , advice_word , positive_word , anxiety_word , 
           kill_word , die_word , anymore_word , life_word , fucking_word , 
           first_pronouns , sec_pronouns , her_word , job_words , therapy_words))
Check model assumptions
#Check for collinearity
vif(fit_reduced)
##            age        is_word       and_word      want_word    people_word 
##       1.118508       1.771984       3.530709       1.507238       1.358033 
##    person_word    advice_word  positive_word   anxiety_word      kill_word 
##       1.355272       1.096909       1.128786       1.185493       1.121491 
##       die_word   anymore_word      life_word   fucking_word first_pronouns 
##       1.372880       1.100012       1.456701       1.087341       3.153895 
##   sec_pronouns       her_word      job_words  therapy_words 
##       1.129326       1.476929       1.304697       1.057962
# get rid of 'first_pronouns" and 'and_words' as a predictor
#since the VIF are > 2.5

#Refit model
training.subset2 <-training.subset %>%
  select(-c(and_word,first_pronouns))

fit_reduced2<-glm(suicidal~.,data=training.subset2,family=binomial)
summary(fit_reduced2)
## 
## Call:
## glm(formula = suicidal ~ ., family = binomial, data = training.subset2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.0174  -0.5903  -0.3639   0.4281   2.7556  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -3.1506     0.5577  -5.649 1.61e-08 ***
## age             0.6587     0.2301   2.862 0.004207 ** 
## is_word        -0.2366     0.1991  -1.188 0.234729    
## want_word       1.5662     0.2292   6.833 8.33e-12 ***
## people_word    -0.4371     0.2431  -1.798 0.072226 .  
## person_word    -1.1131     0.3953  -2.816 0.004864 ** 
## advice_word    -1.7666     0.6788  -2.603 0.009253 ** 
## positive_word  -3.0162     0.8869  -3.401 0.000672 ***
## anxiety_word   -0.4394     0.3697  -1.188 0.234641    
## kill_word       3.7599     0.4181   8.993  < 2e-16 ***
## die_word        1.3668     0.1712   7.983 1.43e-15 ***
## anymore_word    2.2008     0.4898   4.494 7.01e-06 ***
## life_word       1.0966     0.2472   4.436 9.14e-06 ***
## fucking_word    1.3873     0.4933   2.812 0.004918 ** 
## sec_pronouns   -0.4352     0.1328  -3.278 0.001045 ** 
## her_word       -0.1849     0.1521  -1.215 0.224206    
## job_words      -0.5572     0.3235  -1.722 0.084992 .  
## therapy_words  -1.2466     0.7912  -1.576 0.115131    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1406.85  on 1023  degrees of freedom
## Residual deviance:  777.56  on 1006  degrees of freedom
## AIC: 813.56
## 
## Number of Fisher Scoring iterations: 6
#Use BSS to drop non-significant predictors
training.subset3 <- training.subset2 %>%
  select(-is_word)

fit_reduced3 <-glm(suicidal~.,data=training.subset3,family=binomial)
summary(fit_reduced3)
## 
## Call:
## glm(formula = suicidal ~ ., family = binomial, data = training.subset3)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.9944  -0.5933  -0.3638   0.4335   2.6387  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -3.1706     0.5579  -5.683 1.32e-08 ***
## age             0.6568     0.2301   2.854 0.004314 ** 
## want_word       1.5385     0.2272   6.772 1.27e-11 ***
## people_word    -0.4739     0.2410  -1.967 0.049229 *  
## person_word    -1.1241     0.3933  -2.858 0.004259 ** 
## advice_word    -1.7637     0.6713  -2.627 0.008602 ** 
## positive_word  -3.0874     0.8854  -3.487 0.000489 ***
## anxiety_word   -0.4792     0.3700  -1.295 0.195260    
## kill_word       3.6893     0.4110   8.976  < 2e-16 ***
## die_word        1.3397     0.1693   7.915 2.47e-15 ***
## anymore_word    2.1556     0.4899   4.401 1.08e-05 ***
## life_word       1.0263     0.2389   4.295 1.74e-05 ***
## fucking_word    1.3751     0.4946   2.780 0.005434 ** 
## sec_pronouns   -0.4595     0.1308  -3.512 0.000444 ***
## her_word       -0.2188     0.1498  -1.461 0.143997    
## job_words      -0.5554     0.3239  -1.715 0.086342 .  
## therapy_words  -1.3257     0.7792  -1.701 0.088887 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1406.85  on 1023  degrees of freedom
## Residual deviance:  778.99  on 1007  degrees of freedom
## AIC: 812.99
## 
## Number of Fisher Scoring iterations: 6
training.subset4 <- training.subset3 %>%
  select(-anxiety_word)

fit_reduced4 = glm(suicidal~.,data=training.subset4,family=binomial)
summary(fit_reduced4)
## 
## Call:
## glm(formula = suicidal ~ ., family = binomial, data = training.subset4)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.0923  -0.5916  -0.3657   0.4474   2.6409  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -3.1011     0.5538  -5.600 2.14e-08 ***
## age             0.6245     0.2285   2.733 0.006280 ** 
## want_word       1.5141     0.2257   6.709 1.97e-11 ***
## people_word    -0.4606     0.2403  -1.917 0.055255 .  
## person_word    -1.1058     0.3926  -2.817 0.004854 ** 
## advice_word    -1.8062     0.6750  -2.676 0.007452 ** 
## positive_word  -3.1450     0.9151  -3.437 0.000589 ***
## kill_word       3.6479     0.4073   8.956  < 2e-16 ***
## die_word        1.3123     0.1668   7.870 3.56e-15 ***
## anymore_word    2.1474     0.4906   4.378 1.20e-05 ***
## life_word       1.0115     0.2382   4.247 2.17e-05 ***
## fucking_word    1.3453     0.4972   2.706 0.006815 ** 
## sec_pronouns   -0.4622     0.1310  -3.527 0.000420 ***
## her_word       -0.2181     0.1489  -1.464 0.143059    
## job_words      -0.5711     0.3236  -1.765 0.077593 .  
## therapy_words  -1.4014     0.8050  -1.741 0.081704 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1406.85  on 1023  degrees of freedom
## Residual deviance:  780.64  on 1008  degrees of freedom
## AIC: 812.64
## 
## Number of Fisher Scoring iterations: 6
training.subset5 <- training.subset4 %>%
  select(-her_word)

fit_reduced5 = glm(suicidal~.,data=training.subset5,family=binomial)
summary(fit_reduced5)
## 
## Call:
## glm(formula = suicidal ~ ., family = binomial, data = training.subset5)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.0381  -0.5932  -0.3634   0.4473   2.5511  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -3.1118     0.5529  -5.628 1.82e-08 ***
## age             0.6208     0.2281   2.722 0.006495 ** 
## want_word       1.4498     0.2207   6.570 5.04e-11 ***
## people_word    -0.4523     0.2398  -1.886 0.059284 .  
## person_word    -1.1527     0.3904  -2.952 0.003152 ** 
## advice_word    -1.8904     0.6767  -2.793 0.005217 ** 
## positive_word  -3.1635     0.9348  -3.384 0.000714 ***
## kill_word       3.6113     0.4072   8.870  < 2e-16 ***
## die_word        1.2947     0.1655   7.821 5.22e-15 ***
## anymore_word    2.1228     0.4903   4.330 1.49e-05 ***
## life_word       0.9609     0.2358   4.074 4.61e-05 ***
## fucking_word    1.3157     0.4990   2.636 0.008381 ** 
## sec_pronouns   -0.4557     0.1305  -3.492 0.000479 ***
## job_words      -0.6231     0.3256  -1.914 0.055672 .  
## therapy_words  -1.4103     0.8007  -1.761 0.078165 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1406.8  on 1023  degrees of freedom
## Residual deviance:  782.8  on 1009  degrees of freedom
## AIC: 812.8
## 
## Number of Fisher Scoring iterations: 6
Check for outliers
Deviance Residuals
#Deviance Residuals identify observations not well explained by the model.
resids.deviance<-residuals(fit_reduced5, type = c("deviance"))
plot(resids.deviance,training.subset5$post.id)

#a couple of outliers<-3 but nothing too crazy

predictor.names<-colnames(training.subset5)[-1]
n<-length(predictor.names)
#Make plots
for (i in 1:n){
  currvar <- predictor.names[i]
  eval(parse(text=paste('plot(training.subset5$',currvar,',resids.deviance)',sep="")))
}

Overall seems to not have many outliers…most within [-3,3], a few at -4

Hat Diagonals
#Hat Matrix Diagonal detects extreme large points in the design space.
#These are often labeled as "leverage" or "hi" and are related to standardized residuals.
#A general rule says that if hi > 2*p/n or > 3*p/n the points is influential.
#Here "p" is the number of parameters in the model and "n" the number of observations.

#Here, 3*20/1024 = 0.05859375

#hats<-influence.measures(fit_reduced)$hat
hats<-hatvalues(fit_reduced5)
training.subset5 <- training.subset5 %>% mutate(index = row_number())
plot(hats,training.subset5$index)

training.subset5 <- training.subset5 %>% 
  select(-index)

There are quite a lot of possibly influential points. We will get rid of only the extreme outliers.

#Check if removing them does anything to the model
training.subset6<- cbind(training.subset5,resids.deviance,hats) %>%
  filter(!resids.deviance < -3) %>%
  filter(!hats > 0.2) %>%
  select(-c(resids.deviance,hats))


fit6 = glm(suicidal~.,data=training.subset6,family=binomial)
summary(fit6)
## 
## Call:
## glm(formula = suicidal ~ ., family = binomial, data = training.subset6)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6180  -0.5504  -0.3473   0.3439   2.6400  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -3.1969     0.5772  -5.539 3.04e-08 ***
## age             0.5865     0.2380   2.464 0.013735 *  
## want_word       1.4990     0.2327   6.442 1.18e-10 ***
## people_word    -0.3750     0.2514  -1.492 0.135823    
## person_word    -0.9363     0.4132  -2.266 0.023444 *  
## advice_word    -1.1724     0.7562  -1.550 0.121036    
## positive_word  -3.0843     0.9920  -3.109 0.001877 ** 
## kill_word       4.3369     0.4877   8.892  < 2e-16 ***
## die_word        1.4581     0.1762   8.277  < 2e-16 ***
## anymore_word    1.9412     0.5062   3.835 0.000126 ***
## life_word       1.0824     0.2490   4.347 1.38e-05 ***
## fucking_word    2.0863     0.6667   3.129 0.001753 ** 
## sec_pronouns   -0.4822     0.1370  -3.520 0.000431 ***
## job_words      -0.7446     0.3519  -2.116 0.034373 *  
## therapy_words  -1.3182     1.1235  -1.173 0.240665    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1398.16  on 1016  degrees of freedom
## Residual deviance:  721.35  on 1002  degrees of freedom
## AIC: 751.35
## 
## Number of Fisher Scoring iterations: 6
#It does, so we remove the outliers and the predictors
#Therapy, advice, and people should be dropped
#Use BSS

training.subset7 <- training.subset6 %>%
  select(-therapy_words)

fit_reduced7 = glm(suicidal~.,data=training.subset7,family=binomial)
summary(fit_reduced7)
## 
## Call:
## glm(formula = suicidal ~ ., family = binomial, data = training.subset7)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6029  -0.5493  -0.3518   0.3411   2.6455  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -3.1815     0.5768  -5.516 3.47e-08 ***
## age             0.5770     0.2378   2.426 0.015271 *  
## want_word       1.5028     0.2322   6.472 9.66e-11 ***
## people_word    -0.3627     0.2515  -1.442 0.149289    
## person_word    -0.9112     0.4131  -2.206 0.027389 *  
## advice_word    -1.1850     0.7632  -1.553 0.120505    
## positive_word  -3.0449     0.9859  -3.089 0.002011 ** 
## kill_word       4.3295     0.4888   8.858  < 2e-16 ***
## die_word        1.4363     0.1745   8.229  < 2e-16 ***
## anymore_word    1.9445     0.5072   3.834 0.000126 ***
## life_word       1.0779     0.2489   4.331 1.49e-05 ***
## fucking_word    2.0894     0.6678   3.129 0.001754 ** 
## sec_pronouns   -0.4855     0.1369  -3.547 0.000389 ***
## job_words      -0.7626     0.3514  -2.170 0.029984 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1398.16  on 1016  degrees of freedom
## Residual deviance:  722.76  on 1003  degrees of freedom
## AIC: 750.76
## 
## Number of Fisher Scoring iterations: 6
training.subset8 <- training.subset7 %>%
  select(-people_word)

fit_reduced8 = glm(suicidal~.,data=training.subset8,family=binomial)
summary(fit_reduced8)
## 
## Call:
## glm(formula = suicidal ~ ., family = binomial, data = training.subset8)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6246  -0.5459  -0.3496   0.3593   2.6851  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -3.1435     0.5727  -5.489 4.04e-08 ***
## age             0.5513     0.2358   2.338 0.019405 *  
## want_word       1.4412     0.2274   6.339 2.32e-10 ***
## person_word    -0.9912     0.4093  -2.422 0.015443 *  
## advice_word    -1.1507     0.7681  -1.498 0.134092    
## positive_word  -2.9716     0.9710  -3.060 0.002211 ** 
## kill_word       4.3263     0.4886   8.854  < 2e-16 ***
## die_word        1.4173     0.1731   8.188 2.65e-16 ***
## anymore_word    1.9050     0.4996   3.813 0.000137 ***
## life_word       1.0630     0.2484   4.279 1.88e-05 ***
## fucking_word    2.0111     0.6567   3.062 0.002197 ** 
## sec_pronouns   -0.5154     0.1347  -3.826 0.000130 ***
## job_words      -0.7815     0.3506  -2.229 0.025804 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1398.16  on 1016  degrees of freedom
## Residual deviance:  724.88  on 1004  degrees of freedom
## AIC: 750.88
## 
## Number of Fisher Scoring iterations: 6
training.subset9 <- training.subset8 %>%
  select(-advice_word)

fit_reduced9 = glm(suicidal~.,data=training.subset9,family=binomial)
summary(fit_reduced9)
## 
## Call:
## glm(formula = suicidal ~ ., family = binomial, data = training.subset9)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6002  -0.5447  -0.3543   0.3430   2.7054  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -3.1232     0.5706  -5.473 4.42e-08 ***
## age             0.5409     0.2350   2.302 0.021348 *  
## want_word       1.4144     0.2253   6.278 3.42e-10 ***
## person_word    -0.9980     0.4061  -2.458 0.013977 *  
## positive_word  -2.9364     0.9663  -3.039 0.002375 ** 
## kill_word       4.3209     0.4888   8.840  < 2e-16 ***
## die_word        1.4067     0.1723   8.166 3.18e-16 ***
## anymore_word    1.8907     0.4970   3.804 0.000142 ***
## life_word       1.0449     0.2459   4.248 2.15e-05 ***
## fucking_word    2.0486     0.6572   3.117 0.001827 ** 
## sec_pronouns   -0.5331     0.1342  -3.972 7.13e-05 ***
## job_words      -0.7492     0.3476  -2.155 0.031143 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1398.16  on 1016  degrees of freedom
## Residual deviance:  727.26  on 1005  degrees of freedom
## AIC: 751.26
## 
## Number of Fisher Scoring iterations: 6
Check linear relationship between the logit of the response and the predictors
#This looks non-linear because of the predictors
predictor.names<-colnames(training.subset9)[-1]
n<-length(predictor.names)

mut_text=""
#Means
for (i in 1:n){
  if (i==n){
    currvar<-predictor.names[i]
    mut_text = paste(mut_text,'mutate(', currvar, '=mean(training.subset9$',currvar,'))', sep="")
  } else {
    currvar<-predictor.names[i]
    mut_text = paste(mut_text,'mutate(', currvar, '=mean(training.subset9$',currvar,')) %>% ', sep="")
  }
}

mean_data_text <- paste('mean_data<- training.subset9 %>% ',mut_text,sep="")
mean_data_text
## [1] "mean_data<- training.subset9 %>% mutate(age=mean(training.subset9$age)) %>% mutate(want_word=mean(training.subset9$want_word)) %>% mutate(person_word=mean(training.subset9$person_word)) %>% mutate(positive_word=mean(training.subset9$positive_word)) %>% mutate(kill_word=mean(training.subset9$kill_word)) %>% mutate(die_word=mean(training.subset9$die_word)) %>% mutate(anymore_word=mean(training.subset9$anymore_word)) %>% mutate(life_word=mean(training.subset9$life_word)) %>% mutate(fucking_word=mean(training.subset9$fucking_word)) %>% mutate(sec_pronouns=mean(training.subset9$sec_pronouns)) %>% mutate(job_words=mean(training.subset9$job_words))"
eval(parse(text=mean_data_text))

#Make plots
for (i in 1:n){
  currvar <- predictor.names[i]
  othervar <-predictor.names[-i]
  
  #Add the non-modified currvar to the temp
  temp <- mean_data
  eval(parse(text=paste('temp$',currvar,'<-training.subset9$',currvar,sep="")))
  
  #Get the predictions, with the other variables held constant
  predictions <- predict(fit_reduced9,temp,type="response")
  log.odds<-predictions
  temp <- temp %>% mutate(log.odds = log.odds)
  #Get the plot
  eval(parse(text=paste('plot(temp$',currvar,',temp$log.odds)',sep="")))
}

Kill is not linear, so we drop it to avoid biasing the model too much Fucking,anymore, die, an want are not super linear either, but they aren’t as extreme and are very predictive, so we keep them.

Final model

training.subset10 <- training.subset9 %>%
  #select(-c(fucking_word,anymore_word, die_word, kill_word, want_word))
  select(-kill_word)

fit_reduced10<-glm(suicidal~.,data=training.subset10,family=binomial)
summary(fit_reduced10)
## 
## Call:
## glm(formula = suicidal ~ ., family = binomial, data = training.subset10)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.8939  -0.6442  -0.4338   0.5301   2.5246  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -2.9049     0.5240  -5.544 2.96e-08 ***
## age             0.5870     0.2165   2.711 0.006699 ** 
## want_word       1.6681     0.2003   8.329  < 2e-16 ***
## person_word    -0.8294     0.3540  -2.343 0.019133 *  
## positive_word  -2.5071     0.7601  -3.298 0.000973 ***
## die_word        1.2780     0.1604   7.968 1.61e-15 ***
## anymore_word    2.0394     0.4430   4.604 4.15e-06 ***
## life_word       1.0977     0.2224   4.936 7.97e-07 ***
## fucking_word    2.2975     0.6131   3.748 0.000179 ***
## sec_pronouns   -0.4770     0.1190  -4.009 6.08e-05 ***
## job_words      -0.5213     0.3240  -1.609 0.107570    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1398.16  on 1016  degrees of freedom
## Residual deviance:  865.41  on 1006  degrees of freedom
## AIC: 887.41
## 
## Number of Fisher Scoring iterations: 6
fit_final<- fit_reduced10

ROC curves

#ROC curve
train.predictions <- predict(fit_final,training.subset10,type="response")

n<-length(train.predictions)
results<-rep(1,n)
for (i in 1:n){
  #Classify >0.5 as suicidal
  if (train.predictions[i]>=0.5){
    results[i]=1
  } else {
    #Classify <0.5 as not suicidal
    results[i]=0
  }
}

table(results, training.subset10$suicidal)
##        
## results   0   1
##       0 503 121
##       1  60 333
plot(roc(training.subset10$suicidal,results),main="ROC curve for training data")

## 
## Call:
## roc.default(response = training.subset10$suicidal, predictor = results)
## 
## Data: results in 563 controls (training.subset10$suicidal 0) < 454 cases (training.subset10$suicidal 1).
## Area under the curve: 0.8135
auc(training.subset10$suicidal, results) # 0.8135
## Area under the curve: 0.8135
colnames(training.subset10)
##  [1] "suicidal"      "age"           "want_word"     "person_word"  
##  [5] "positive_word" "die_word"      "anymore_word"  "life_word"    
##  [9] "fucking_word"  "sec_pronouns"  "job_words"
#Test set
test.set.for.analysis <- test.set %>%
  select(c(suicidal,age,want_word, person_word,   positive_word,
           die_word,anymore_word,  life_word, fucking_word,  sec_pronouns,
           job_words))

test.predictions <- predict(fit_final,test.set.for.analysis,type="response")

n<-length(test.set.for.analysis$suicidal)
results<-rep(1,n)
for (i in 1:n){
  #Classify >0.5 as suicidal
  if (test.predictions[i]>=0.5){
    results[i]=1
  } else {
    #Classify <0.5 as not suicidal
    results[i]=0
  }
}
table(results,test.set.for.analysis$suicidal)
##        
## results   0   1
##       0 167  43
##       1  21 113
plot(roc(test.set.for.analysis$suicidal,results),main="ROC curve for test data")

## 
## Call:
## roc.default(response = test.set.for.analysis$suicidal, predictor = results)
## 
## Data: results in 188 controls (test.set.for.analysis$suicidal 0) < 156 cases (test.set.for.analysis$suicidal 1).
## Area under the curve: 0.8063
auc(test.set.for.analysis$suicidal, results) #0.8063
## Area under the curve: 0.8063